Dados disponíveis em: http://insideairbnb.com/get-the-data.html — Referência a seguir: https://www.kaggle.com/josipdomazet/mining-nyc-airbnb-data-using-r
Download:
Importa o dataset
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1 ✔ purrr 0.3.2
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 1.0.0 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## Parsed with column specification:
## cols(
## .default = col_character(),
## id = col_double(),
## scrape_id = col_double(),
## last_scraped = col_date(format = ""),
## thumbnail_url = col_logical(),
## medium_url = col_logical(),
## xl_picture_url = col_logical(),
## host_id = col_double(),
## host_since = col_date(format = ""),
## host_is_superhost = col_logical(),
## host_listings_count = col_double(),
## host_total_listings_count = col_double(),
## host_has_profile_pic = col_logical(),
## host_identity_verified = col_logical(),
## neighbourhood_group_cleansed = col_logical(),
## latitude = col_double(),
## longitude = col_double(),
## is_location_exact = col_logical(),
## accommodates = col_double(),
## bathrooms = col_double(),
## bedrooms = col_double()
## # ... with 40 more columns
## )
## See spec(...) for full column specifications.
## Warning: 3 parsing failures.
## row col expected actual file
## 1745 license 1/0/T/F/TRUE/FALSE +1512-6670366 <connection>
## 28567 license 1/0/T/F/TRUE/FALSE 56131/AL <connection>
## 34253 license 1/0/T/F/TRUE/FALSE 05.557.336/0001-70 <connection>
## # A tibble: 35,451 x 17
## name host_name neighbourhood_c… latitude longitude property_type
## <chr> <fct> <fct> <dbl> <dbl> <fct>
## 1 Very… Matthias Copacabana -23.0 -43.2 Condominium
## 2 Beau… Viviane Copacabana -23.0 -43.2 Apartment
## 3 NICE… Renata Ipanema -23.0 -43.2 Apartment
## 4 Cosy… Patricia Ipanema -23.0 -43.2 Apartment
## 5 COPA… Patricia… Copacabana -23.0 -43.2 Loft
## 6 Copa… Seba Copacabana -23.0 -43.2 Apartment
## 7 Beac… Alex Ipanema -23.0 -43.2 Serviced apa…
## 8 Rio … Vana Copacabana -23.0 -43.2 Apartment
## 9 4bed… Marcio Copacabana -23.0 -43.2 Apartment
## 10 HUma… Marcio Humaitá -23.0 -43.2 Apartment
## # … with 35,441 more rows, and 11 more variables: room_type <fct>,
## # price <dbl>, accommodates <dbl>, bedrooms <dbl>, minimum_nights <dbl>,
## # availability_365 <dbl>, number_of_reviews <dbl>,
## # review_scores_rating <dbl>, cancellation_policy <fct>,
## # require_guest_profile_picture <lgl>,
## # require_guest_phone_verification <lgl>
# Per-column summary statistics of the listings data.
summary(airbnb)
## name host_name neighbourhood_cleansed
## Length:35451 Daniel : 432 Copacabana : 8825
## Class :character Ricardo: 322 Barra da Tijuca : 3908
## Mode :character Maria : 315 Ipanema : 2970
## Marcelo: 311 Jacarepaguá : 1917
## Mario : 309 Botafogo : 1767
## (Other):33702 Recreio dos Bandeirantes: 1750
## NA's : 60 (Other) :14314
## latitude longitude property_type
## Min. :-23.07 Min. :-43.74 Apartment :27023
## 1st Qu.:-22.98 1st Qu.:-43.32 House : 3709
## Median :-22.97 Median :-43.20 Condominium : 1618
## Mean :-22.96 Mean :-43.25 Serviced apartment: 903
## 3rd Qu.:-22.94 3rd Qu.:-43.19 Loft : 653
## Max. :-22.75 Max. :-43.10 Bed and breakfast : 281
## (Other) : 1264
## room_type price accommodates
## Entire home/apt:25006 Min. : 0.0 Min. : 1.000
## Private room : 9586 1st Qu.: 150.0 1st Qu.: 2.000
## Shared room : 859 Median : 281.0 Median : 4.000
## Mean : 622.2 Mean : 4.175
## 3rd Qu.: 599.0 3rd Qu.: 5.000
## Max. :40000.0 Max. :160.000
##
## bedrooms minimum_nights availability_365 number_of_reviews
## Min. : 0.000 Min. : 1.000 Min. : 0.0 Min. : 0.000
## 1st Qu.: 1.000 1st Qu.: 1.000 1st Qu.: 0.0 1st Qu.: 0.000
## Median : 1.000 Median : 2.000 Median :179.0 Median : 1.000
## Mean : 1.637 Mean : 4.736 Mean :190.1 Mean : 7.952
## 3rd Qu.: 2.000 3rd Qu.: 4.000 3rd Qu.:362.0 3rd Qu.: 5.000
## Max. :22.000 Max. :1123.000 Max. :365.0 Max. :350.000
## NA's :23
## review_scores_rating cancellation_policy
## Min. : 20.00 flexible :15552
## 1st Qu.: 93.00 moderate : 5757
## Median : 98.00 strict : 2
## Mean : 94.35 strict_14_with_grace_period:13589
## 3rd Qu.:100.00 super_strict_30 : 163
## Max. :100.00 super_strict_60 : 388
## NA's :17455
## require_guest_profile_picture require_guest_phone_verification
## Mode :logical Mode :logical
## FALSE:34873 FALSE:34850
## TRUE :578 TRUE :601
##
##
##
##
# Compact overview of every column: name, type and first values.
airbnb %>% glimpse()
## Observations: 35,451
## Variables: 17
## $ name <chr> "Very Nice 2Br - Copacabana - W…
## $ host_name <fct> Matthias, Viviane, Renata, Patr…
## $ neighbourhood_cleansed <fct> Copacabana, Copacabana, Ipanema…
## $ latitude <dbl> -22.96592, -22.97712, -22.98302…
## $ longitude <dbl> -43.17896, -43.19045, -43.21427…
## $ property_type <fct> Condominium, Apartment, Apartme…
## $ room_type <fct> Entire home/apt, Entire home/ap…
## $ price <dbl> 296, 161, 243, 337, 221, 150, 3…
## $ accommodates <dbl> 5, 3, 3, 3, 2, 2, 13, 1, 11, 3,…
## $ bedrooms <dbl> 2, 1, 1, 1, 1, 1, 6, 1, 4, 1, 1…
## $ minimum_nights <dbl> 4, 4, 2, 2, 3, 2, 2, 3, 4, 5, 3…
## $ availability_365 <dbl> 332, 352, 125, 122, 145, 89, 29…
## $ number_of_reviews <dbl> 233, 232, 260, 160, 303, 1, 54,…
## $ review_scores_rating <dbl> 93, 94, 96, 94, 98, NA, 91, 98,…
## $ cancellation_policy <fct> strict_14_with_grace_period, st…
## $ require_guest_profile_picture <lgl> FALSE, TRUE, FALSE, TRUE, FALSE…
## $ require_guest_phone_verification <lgl> FALSE, TRUE, FALSE, TRUE, TRUE,…
Valores ausentes (missing) antes da remoção
# Count the NA values in every column, reshape the result to long format
# (one row per variable) and show only the variables that contain NAs.
# pivot_longer() replaces the superseded gather(); the output columns keep
# the same names ("variables", "missing").
missing_airbnb <- airbnb %>%
  summarise_all(~ sum(is.na(.))) %>%
  pivot_longer(everything(), names_to = "variables", values_to = "missing")
missing_airbnb %>% filter(missing > 0)
## # A tibble: 4 x 2
## variables missing
## <chr> <int>
## 1 name 66
## 2 host_name 60
## 3 bedrooms 23
## 4 review_scores_rating 17455
# Data cleaning, in a single pipeline:
#   * drop listings whose review count is zero;
#   * drop listings whose price is zero;
#   * drop rows with a missing rating or a missing bedroom count.
airbnb <- airbnb %>%
  filter(number_of_reviews != 0, price != 0) %>%
  drop_na(review_scores_rating, bedrooms)

# Inspect the cleaned data.
airbnb %>% glimpse()
## Observations: 17,979
## Variables: 17
## $ name <chr> "Very Nice 2Br - Copacabana - W…
## $ host_name <fct> Matthias, Viviane, Renata, Patr…
## $ neighbourhood_cleansed <fct> Copacabana, Copacabana, Ipanema…
## $ latitude <dbl> -22.96592, -22.97712, -22.98302…
## $ longitude <dbl> -43.17896, -43.19045, -43.21427…
## $ property_type <fct> Condominium, Apartment, Apartme…
## $ room_type <fct> Entire home/apt, Entire home/ap…
## $ price <dbl> 296, 161, 243, 337, 221, 3250, …
## $ accommodates <dbl> 5, 3, 3, 3, 2, 13, 1, 11, 4, 6,…
## $ bedrooms <dbl> 2, 1, 1, 1, 1, 6, 1, 4, 1, 3, 4…
## $ minimum_nights <dbl> 4, 4, 2, 2, 3, 2, 3, 4, 3, 2, 1…
## $ availability_365 <dbl> 332, 352, 125, 122, 145, 298, 3…
## $ number_of_reviews <dbl> 233, 232, 260, 160, 303, 54, 40…
## $ review_scores_rating <dbl> 93, 94, 96, 94, 98, 91, 98, 80,…
## $ cancellation_policy <fct> strict_14_with_grace_period, st…
## $ require_guest_profile_picture <lgl> FALSE, TRUE, FALSE, TRUE, FALSE…
## $ require_guest_phone_verification <lgl> FALSE, TRUE, FALSE, TRUE, TRUE,…
Valores ausentes (missing) após a remoção
# Recount the NA values per column on the current data and list the
# variables that still contain NAs. pivot_longer() replaces the superseded
# gather(); the output columns keep the same names.
missing_airbnb <- airbnb %>%
  summarise_all(~ sum(is.na(.))) %>%
  pivot_longer(everything(), names_to = "variables", values_to = "missing")
missing_airbnb %>% filter(missing > 0)
## # A tibble: 2 x 2
## variables missing
## <chr> <int>
## 1 name 2
## 2 host_name 40
# Number of neighbourhoods shown individually; all others are pooled into
# a single "Outros" category.
n_bairros <- 7

# Listings per neighbourhood: the n_bairros most frequent ones keep their
# name, the remainder is summed under "Outros". Selecting by row position
# (instead of `1:n_bairros` plus `rep(..., n() - n_bairros)`) also works when
# the data has n_bairros or fewer distinct neighbourhoods.
bairros <- airbnb %>%
  count(neighbourhood_cleansed, sort = TRUE) %>%
  mutate(
    bairro = if_else(
      row_number() <= n_bairros,
      as.character(neighbourhood_cleansed),
      "Outros"
    ),
    # Levels follow the frequency order, with "Outros" (if present) last.
    bairro = factor(bairro, levels = unique(bairro))
  ) %>%
  group_by(bairro) %>%
  tally(n)

# Bar chart with the count printed above each bar. The legend is hidden
# because the x axis already names each neighbourhood.
bairros %>%
  ggplot(aes(bairro, n, fill = bairro)) +
  geom_col() +
  geom_text(aes(label = n), vjust = -0.4, size = 3.5) +
  theme(legend.position = "none") +
  xlab("Bairro") +
  ylab("Frequência")
# Bar chart of listings per room type, with the count printed above each bar.
airbnb %>%
  ggplot(aes(x = room_type, fill = room_type)) +
  geom_bar() +
  geom_text(aes(label = ..count..), stat = "count", vjust = -0.4, size = 3.5)
# Number of property types shown individually; all others are pooled into
# a single "Outros" category.
n_tipos <- 6

# Listings per property type: the n_tipos most frequent ones keep their
# name, the remainder is summed under "Outros". Selecting by row position
# (instead of `1:n_tipos` plus `rep(..., n() - n_tipos)`) also works when the
# data has n_tipos or fewer distinct property types.
tipos_propriedade <- airbnb %>%
  count(property_type, sort = TRUE) %>%
  mutate(
    tipo_propriedade = if_else(
      row_number() <= n_tipos,
      as.character(property_type),
      "Outros"
    ),
    # Levels follow the frequency order, with "Outros" (if present) last.
    tipo_propriedade = factor(tipo_propriedade, levels = unique(tipo_propriedade))
  ) %>%
  group_by(tipo_propriedade) %>%
  tally(n)

# Bar chart with the count printed above each bar; the fill legend names
# each property type.
# NOTE(review): `axis.text` blanks the tick labels of BOTH axes, whereas the
# cancellation-policy chart only blanks `axis.text.x` - confirm that hiding
# the y-axis values here is intended.
tipos_propriedade %>%
  ggplot(aes(tipo_propriedade, n, fill = tipo_propriedade)) +
  geom_col() +
  geom_text(aes(label = n), vjust = -0.4, size = 3.5) +
  xlab("Tipo de propriedade") +
  ylab("Frequência") +
  theme(axis.text = element_blank())
# Listings per cancellation policy, bars ordered from most to least frequent.
# The x tick labels are hidden; the fill legend identifies each policy.
airbnb %>%
  count(cancellation_policy, sort = TRUE) %>%
  ggplot(aes(
    x = reorder(cancellation_policy, -n),
    y = n,
    fill = reorder(cancellation_policy, -n)
  )) +
  geom_col() +
  geom_text(aes(label = n), vjust = -0.4, size = 3.5) +
  theme(axis.text.x = element_blank()) +
  labs(
    x = "Política de cancelamento",
    y = "Frequência",
    fill = "Política de cancelamento"
  )
# Theme for pie charts: removes axis titles, x-axis text, ticks, panel border
# and grid lines, and renders the plot title in bold at size 14.
blank_theme <- theme(
  plot.title = element_text(size = 14, face = "bold"),
  axis.title.x = element_blank(),
  axis.title.y = element_blank(),
  axis.text.x = element_blank(),
  axis.ticks = element_blank(),
  panel.border = element_blank(),
  panel.grid = element_blank()
)
# Pie chart (a single stacked bar drawn in polar coordinates) of the
# require_guest_profile_picture flag, with the count written in each slice.
ggplot(airbnb, aes(x = "", fill = require_guest_profile_picture)) +
  geom_bar(width = 1) +
  geom_text(
    aes(label = ..count..),
    stat = "count",
    position = position_stack(vjust = 0.5),
    color = "white"
  ) +
  coord_polar("y", start = 0) +
  blank_theme +
  labs(fill = "") +
  ggtitle("Requer foto de perfil do hóspede")
# Pie chart (a single stacked bar drawn in polar coordinates) of the
# require_guest_phone_verification flag, with the count written in each slice.
ggplot(airbnb, aes(x = "", fill = require_guest_phone_verification)) +
  geom_bar(width = 1) +
  geom_text(
    aes(label = ..count..),
    stat = "count",
    position = position_stack(vjust = 0.5),
    color = "white"
  ) +
  coord_polar("y", start = 0) +
  blank_theme +
  labs(fill = "") +
  ggtitle("Requer que o hóspede tenha telefone verificado")
# Five-number summary and mean of the accommodates column.
summary(airbnb$accommodates)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 4.000 3.988 5.000 160.000
# Boxplot of accommodates, restricted to listings below 50 guests.
airbnb %>%
  filter(accommodates < 50) %>%
  ggplot(aes(x = "", y = accommodates)) +
  geom_boxplot(fill = "purple") +
  labs(x = "", y = "Número") +
  ggtitle(
    "Quantidade máxima de pessoas acomodadas",
    subtitle = "Removido um valor 160"
  ) +
  theme(legend.position = "none")
# Histogram (31 bins) of accommodates, restricted to listings below 50 guests.
airbnb %>%
  filter(accommodates < 50) %>%
  ggplot(aes(accommodates)) +
  geom_histogram(bins = 31, fill = "purple") +
  labs(x = "", y = "Frequência") +
  ggtitle(
    "Quantidade máxima de pessoas acomodadas",
    subtitle = "Removido um valor 160"
  ) +
  theme(legend.position = "none")
# Five-number summary and mean of the bedrooms column.
summary(airbnb$bedrooms)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 1.000 1.000 1.511 2.000 13.000
# Boxplot of the number of bedrooms per listing.
ggplot(airbnb, aes(x = "", y = bedrooms)) +
  geom_boxplot(fill = "lightblue") +
  labs(x = "", y = "Número") +
  ggtitle("Número de quartos") +
  theme(legend.position = "none")
# Summary of the bedrooms column, shown again ahead of the histogram.
summary(airbnb$bedrooms)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 1.000 1.000 1.511 2.000 13.000
# Histogram (28 bins) of the number of bedrooms per listing.
ggplot(airbnb, aes(bedrooms)) +
  geom_histogram(bins = 28, fill = "lightblue") +
  labs(x = "Quartos", y = "Frequência") +
  ggtitle("Número de quartos") +
  theme(legend.position = "none")
# Five-number summary and mean of the minimum_nights column.
summary(airbnb$minimum_nights)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 2.000 3.698 3.000 1123.000
# Boxplot of minimum_nights drawn on a log10 y axis.
ggplot(airbnb, aes(x = "", y = minimum_nights)) +
  geom_boxplot(fill = "lightblue") +
  scale_y_log10() +
  labs(x = "", y = "Noites") +
  ggtitle("Número mínimo de noites", subtitle = "Com escala logarítmica") +
  theme(legend.position = "none")
# Histogram (28 bins) of minimum_nights over all listings.
# A filter on minimum_nights < 50 was tried here and is left disabled:
#   filter(minimum_nights < 50) %>%
ggplot(airbnb, aes(minimum_nights)) +
  geom_histogram(bins = 28, fill = "lightgreen") +
  labs(x = "Noites", y = "Frequência") +
  ggtitle("Número mínimo de noites") +
  theme(legend.position = "none")
# Histogram (19 bins) of minimum_nights for listings requiring 20 nights
# or fewer.
airbnb %>%
  filter(minimum_nights <= 20) %>%
  ggplot(aes(minimum_nights)) +
  geom_histogram(bins = 19, fill = "lightgreen") +
  labs(x = "Noites", y = "Frequência") +
  ggtitle(
    "Número mínimo de noites",
    subtitle = "Removidos valores maiores que 20"
  ) +
  theme(legend.position = "none")
# Five-number summary and mean of the availability_365 column.
summary(airbnb$availability_365)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 68.0 180.0 195.1 349.0 365.0
# Boxplot of availability_365.
ggplot(airbnb, aes(x = "", y = availability_365)) +
  geom_boxplot(fill = "blue") +
  labs(x = "", y = "Dias") +
  ggtitle("Disponibilidade 365") +
  theme(legend.position = "none")
# Histogram (28 bins) of availability_365.
ggplot(airbnb, aes(availability_365)) +
  geom_histogram(bins = 28, fill = "blue") +
  labs(x = "Dias", y = "Frequência") +
  ggtitle("Disponibilidade 365") +
  theme(legend.position = "none")
# Five-number summary and mean of the number_of_reviews column.
summary(airbnb$number_of_reviews)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 2.00 4.00 15.61 15.00 350.00
# Boxplot of number_of_reviews drawn on a log10 y axis.
ggplot(airbnb, aes(x = "", y = number_of_reviews)) +
  geom_boxplot(fill = "pink") +
  scale_y_log10() +
  labs(x = "", y = "Avaliações") +
  ggtitle("Número de avaliações", subtitle = "Com escala logaritmica") +
  theme(legend.position = "none")
# Histogram (30 bins) of number_of_reviews for listings with at most
# 150 reviews.
airbnb %>%
  filter(number_of_reviews <= 150) %>%
  ggplot(aes(number_of_reviews)) +
  geom_histogram(bins = 30, fill = "pink") +
  labs(x = "Avaliações", y = "Frequência") +
  ggtitle(
    "Número de avaliações",
    subtitle = "Filtrados valores acima de 150"
  ) +
  theme(legend.position = "none")
# Five-number summary and mean of the review_scores_rating column.
summary(airbnb$review_scores_rating)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 20.00 93.00 98.00 94.35 100.00 100.00
# Boxplot of review_scores_rating.
# The y label and title previously read "Avaliações" / "Número de avaliações",
# carried over from the review-count plot; they now name the rating itself,
# matching the labels used by the rating histograms.
airbnb %>%
  ggplot(aes(x = "", y = review_scores_rating)) +
  geom_boxplot(fill = "#18d9cc") +
  xlab("") +
  ylab("Avaliação") +
  ggtitle("Avaliação") +
  theme(legend.position = "none")
# Histogram (50 bins) of review_scores_rating.
ggplot(airbnb, aes(review_scores_rating)) +
  geom_histogram(bins = 50, fill = "#18d9cc") +
  labs(x = "Avaliação", y = "Frequência") +
  ggtitle("Avaliação") +
  theme(legend.position = "none")
# Histogram (50 bins) of review_scores_rating, excluding ratings of 100.
airbnb %>%
  filter(review_scores_rating < 100) %>%
  ggplot(aes(review_scores_rating)) +
  geom_histogram(bins = 50, fill = "#18d9cc") +
  labs(x = "Avaliação", y = "Frequência") +
  ggtitle("Avaliação", subtitle = "Removidas notas 100") +
  theme(legend.position = "none")
# Pie chart (a single stacked bar drawn in polar coordinates) of listings
# whose rating equals 100 versus the rest, with the count in each slice.
# The former group_by(notamaxima) step was dropped: the counting is done by
# the "count" stat per fill value, so the grouping had no effect on the plot.
airbnb %>%
  mutate(notamaxima = (review_scores_rating == 100)) %>%
  ggplot(aes(x = "", fill = notamaxima)) +
  geom_bar(width = 1) +
  coord_polar("y", start = 0) +
  blank_theme +
  geom_text(
    stat = "count",
    aes(label = ..count..),
    position = position_stack(vjust = 0.5),
    color = "white"
  ) +
  labs(fill = "Nota máxima") +
  ggtitle("Nota máxima (100)")
# Stacked histogram (30 bins) of price, filled by room type.
# Fixes the misspelled title ("Distribução" -> "Distribuição").
ggplot(airbnb, aes(price, fill = room_type)) +
  geom_histogram(bins = 30) +
  # Disabled alternative layer:
  # geom_density(alpha = 0.2, fill = "purple") +
  ggtitle(
    "Distribuição de preço",
    subtitle = "A distribuição é muito inclinada"
  ) +
  theme(axis.title = element_text(), axis.title.x = element_text())
# Disabled mean-price marker:
# geom_vline(xintercept = round(mean(airbnb$price), 2), size = 2, linetype = 3)
# Stacked histogram (30 bins) of price, filled by room type, drawn with a
# log10-transformed x axis.
airbnb %>%
  ggplot(aes(price, fill = room_type)) +
  geom_histogram(bins = 30) +
  scale_x_log10() +
  ggtitle(
    "Distribuição transformada do preço",
    subtitle = expression("Com uma transformação" ~'log'[10] ~ "do eixo x")
  )
# Disabled layers kept for reference:
# theme(axis.title = element_text(), axis.title.x = element_text()) +
# geom_vline(xintercept = round(mean(airbnb$price), 2), size = 2, linetype = 3) +
# annotate("text", x = 1800, y = 0.75,label = paste("Mean price = ", paste0(round(mean(airbnb$price), 2), "$")),
# color = "#32CD32", size = 8)
# Density-scaled price histograms (30 bins, log10 x axis), one facet per
# room type; the legend is suppressed for the histogram layer.
airbnb %>%
  ggplot(aes(price, fill = room_type)) +
  geom_histogram(aes(y = ..density..), bins = 30, show.legend = FALSE) +
  scale_x_log10() +
  facet_wrap(~room_type)
# Price boxplots per room type on a log10 y axis. The dashed purple
# horizontal line is drawn at the mean price over all listings.
airbnb %>%
  ggplot(aes(x = room_type, y = price)) +
  geom_boxplot(aes(fill = room_type)) +
  scale_y_log10() +
  labs(x = "Tipo de quarto", y = "Preço") +
  ggtitle("Boxplots de preço por tipo de quarto") +
  geom_hline(yintercept = mean(airbnb$price), color = "purple", linetype = 2) +
  theme(legend.position = "none")
library(corrplot)
## corrplot 0.84 loaded
# Spearman correlation between all numeric columns, plotted as a coloured
# matrix. vapply() replaces sapply() so the column mask is guaranteed to be
# a logical vector with one entry per column.
airbnb_cor <- airbnb[, vapply(airbnb, is.numeric, logical(1))]
# Keep only rows without missing values in any numeric column.
airbnb_cor <- airbnb_cor[complete.cases(airbnb_cor), ]
correlation_matrix <- cor(airbnb_cor, method = "spearman")
corrplot(correlation_matrix, method = "color")
# Colour mapping for room types, built from the five listed colours.
pal <- colorFactor(
  palette = c("red", "green", "blue", "purple", "yellow"),
  domain = airbnb$room_type
)

# Interactive map: one small circle marker per listing, coloured by room
# type and labelled with the listing name, plus a legend at bottom right.
leaflet(data = airbnb) %>%
  addProviderTiles(providers$CartoDB.DarkMatterNoLabels) %>%
  addCircleMarkers(
    lng = ~longitude,
    lat = ~latitude,
    radius = 1,
    weight = 1,
    color = ~pal(room_type),
    opacity = 1,
    fillOpacity = 0.1,
    label = ~paste("Name:", name)
  ) %>%
  addLegend(
    "bottomright",
    pal = pal,
    values = ~room_type,
    title = "Room types",
    opacity = 1
  )
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# 3D scatter of listings: longitude and latitude on the horizontal axes,
# price on the vertical axis, coloured by room type. The trace type and mode
# are stated explicitly instead of being left to plotly's inference, which
# emitted "No trace type specified" / "No scatter3d mode specifed" messages.
plot_ly(
  airbnb,
  x = ~longitude,
  y = ~latitude,
  z = ~price,
  color = ~room_type,
  type = "scatter3d",
  mode = "markers"
)
## No trace type specified:
## Based on info supplied, a 'scatter3d' trace seems appropriate.
## Read more about this trace type -> https://plot.ly/r/reference/#scatter3d
## No scatter3d mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode